In [9]:
!pip install pandas openpyxl
Requirement already satisfied: pandas in d:\python\python313\lib\site-packages (2.3.1)
Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Requirement already satisfied: numpy>=1.26.0 in d:\python\python313\lib\site-packages (from pandas) (2.3.2)
Requirement already satisfied: python-dateutil>=2.8.2 in d:\python\python313\lib\site-packages (from pandas) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in d:\python\python313\lib\site-packages (from pandas) (2025.2)
Requirement already satisfied: tzdata>=2022.7 in d:\python\python313\lib\site-packages (from pandas) (2025.2)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Requirement already satisfied: six>=1.5 in d:\python\python313\lib\site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl

   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   ---------------------------------------- 2/2 [openpyxl]

Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5
In [64]:
!pip install scipy
Collecting scipy
  Downloading scipy-1.16.1-cp313-cp313-win_amd64.whl.metadata (60 kB)
Requirement already satisfied: numpy<2.6,>=1.25.2 in d:\python\python313\lib\site-packages (from scipy) (2.3.2)
Downloading scipy-1.16.1-cp313-cp313-win_amd64.whl (38.5 MB)
   ---------------------------------------- 0.0/38.5 MB ? eta -:--:--
   ---- ----------------------------------- 3.9/38.5 MB 20.8 MB/s eta 0:00:02
   ------- -------------------------------- 7.3/38.5 MB 17.7 MB/s eta 0:00:02
   ----------- ---------------------------- 11.5/38.5 MB 18.5 MB/s eta 0:00:02
   --------------- ------------------------ 15.2/38.5 MB 18.3 MB/s eta 0:00:02
   -------------------- ------------------- 19.9/38.5 MB 19.2 MB/s eta 0:00:01
   ----------------------- ---------------- 22.3/38.5 MB 18.1 MB/s eta 0:00:01
   --------------------------- ------------ 26.2/38.5 MB 18.1 MB/s eta 0:00:01
   ------------------------------- -------- 30.1/38.5 MB 18.2 MB/s eta 0:00:01
   ----------------------------------- ---- 34.3/38.5 MB 18.5 MB/s eta 0:00:01
   ---------------------------------------  38.3/38.5 MB 18.4 MB/s eta 0:00:01
   ---------------------------------------- 38.5/38.5 MB 17.9 MB/s  0:00:02
Installing collected packages: scipy
Successfully installed scipy-1.16.1
In [2]:
import pandas as pd
import numpy as np

from scipy import stats
from scipy.stats import t

import re
from collections import Counter

from IPython.display import Image, display
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
pio.renderers.default
import plotly.colors as colors
pio.templates.default = 'plotly_white'
pio.renderers.default = "notebook"

1. Data Processing¶

1.1 Data Load¶

In [4]:
behaviour = pd.read_csv('QVI_purchase_behaviour.csv')
behaviour.head()
Out[4]:
LYLTY_CARD_NBR LIFESTAGE PREMIUM_CUSTOMER
0 1000 YOUNG SINGLES/COUPLES Premium
1 1002 YOUNG SINGLES/COUPLES Mainstream
2 1003 YOUNG FAMILIES Budget
3 1004 OLDER SINGLES/COUPLES Mainstream
4 1005 MIDAGE SINGLES/COUPLES Mainstream
In [5]:
transaction = pd.read_excel('QVI_transaction_data.xlsx')
transaction.head()
Out[5]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3
2 2019-05-20 1 1343 383 61 Smiths Crinkle Cut Chips Chicken 170g 2 2.9
3 2018-08-17 2 2373 974 69 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0
4 2018-08-18 2 2426 1038 108 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8

1.2 Data Cleaning¶

Check null data, outliers. According to info, we decide how to deal with these data (Directly delete or replace).

In [6]:
behaviour.describe(include = [object])
Out[6]:
LIFESTAGE PREMIUM_CUSTOMER
count 72637 72637
unique 7 3
top RETIREES Mainstream
freq 14805 29245
In [7]:
behaviour.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72637 entries, 0 to 72636
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   LYLTY_CARD_NBR    72637 non-null  int64 
 1   LIFESTAGE         72637 non-null  object
 2   PREMIUM_CUSTOMER  72637 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.7+ MB
In [8]:
transaction.describe()
Out[8]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_QTY TOT_SALES
count 264836 264836.00000 2.648360e+05 2.648360e+05 264836.000000 264836.000000 264836.000000
mean 2018-12-30 00:52:12.879215616 135.08011 1.355495e+05 1.351583e+05 56.583157 1.907309 7.304200
min 2018-07-01 00:00:00 1.00000 1.000000e+03 1.000000e+00 1.000000 1.000000 1.500000
25% 2018-09-30 00:00:00 70.00000 7.002100e+04 6.760150e+04 28.000000 2.000000 5.400000
50% 2018-12-30 00:00:00 130.00000 1.303575e+05 1.351375e+05 56.000000 2.000000 7.400000
75% 2019-03-31 00:00:00 203.00000 2.030942e+05 2.027012e+05 85.000000 2.000000 9.200000
max 2019-06-30 00:00:00 272.00000 2.373711e+06 2.415841e+06 114.000000 200.000000 650.000000
std NaN 76.78418 8.057998e+04 7.813303e+04 32.826638 0.643654 3.083226
In [9]:
transaction.describe(include = [object])
Out[9]:
PROD_NAME
count 264836
unique 114
top Kettle Mozzarella Basil & Pesto 175g
freq 3304
In [10]:
transaction.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264836 entries, 0 to 264835
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   DATE            264836 non-null  datetime64[ns]
 1   STORE_NBR       264836 non-null  int64         
 2   LYLTY_CARD_NBR  264836 non-null  int64         
 3   TXN_ID          264836 non-null  int64         
 4   PROD_NBR        264836 non-null  int64         
 5   PROD_NAME       264836 non-null  object        
 6   PROD_QTY        264836 non-null  int64         
 7   TOT_SALES       264836 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(5), object(1)
memory usage: 16.2+ MB
In [13]:
# There is no null value. So I try to clean the outliers
fig = px.box(transaction, y = 'TOT_SALES', title = 'Boxplot of Total Sales')
fig.show()
In [14]:
fig = px.box(transaction, y = 'PROD_QTY', title = 'Boxplot of Product Quantities')
fig.show()
In [15]:
# Find outliers
outlier_record = transaction[transaction['PROD_QTY'] == 200]
print(outlier_record)
            DATE  STORE_NBR  LYLTY_CARD_NBR  TXN_ID  PROD_NBR  \
69762 2018-08-19        226          226000  226201         4   
69763 2019-05-20        226          226000  226210         4   

                              PROD_NAME  PROD_QTY  TOT_SALES  
69762  Dorito Corn Chp     Supreme 380g       200      650.0  
69763  Dorito Corn Chp     Supreme 380g       200      650.0  
In [16]:
transaction = transaction[transaction['PROD_QTY'] != 200]
transaction.describe()
Out[16]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_QTY TOT_SALES
count 264834 264834.000000 2.648340e+05 2.648340e+05 264834.000000 264834.000000 264834.000000
mean 2018-12-30 00:52:10.292938240 135.079423 1.355488e+05 1.351576e+05 56.583554 1.905813 7.299346
min 2018-07-01 00:00:00 1.000000 1.000000e+03 1.000000e+00 1.000000 1.000000 1.500000
25% 2018-09-30 00:00:00 70.000000 7.002100e+04 6.760050e+04 28.000000 2.000000 5.400000
50% 2018-12-30 00:00:00 130.000000 1.303570e+05 1.351365e+05 56.000000 2.000000 7.400000
75% 2019-03-31 00:00:00 203.000000 2.030940e+05 2.026998e+05 85.000000 2.000000 9.200000
max 2019-06-30 00:00:00 272.000000 2.373711e+06 2.415841e+06 114.000000 5.000000 29.500000
std NaN 76.784063 8.057990e+04 7.813292e+04 32.826444 0.343436 2.527241

1.3 Merge 2 Datasets¶

Merge these 2 datasets by LYLTY_CARD_NBR so we can better do data analytics. Then check if there are any illegal data.

In [17]:
merged = transaction.merge(behaviour, on = 'LYLTY_CARD_NBR', how = 'left')
merged.head()
Out[17]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES LIFESTAGE PREMIUM_CUSTOMER
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0 YOUNG SINGLES/COUPLES Premium
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3 MIDAGE SINGLES/COUPLES Budget
2 2019-05-20 1 1343 383 61 Smiths Crinkle Cut Chips Chicken 170g 2 2.9 MIDAGE SINGLES/COUPLES Budget
3 2018-08-17 2 2373 974 69 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0 MIDAGE SINGLES/COUPLES Budget
4 2018-08-18 2 2426 1038 108 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8 MIDAGE SINGLES/COUPLES Budget
In [18]:
merged.describe()
Out[18]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_QTY TOT_SALES
count 264834 264834.000000 2.648340e+05 2.648340e+05 264834.000000 264834.000000 264834.000000
mean 2018-12-30 00:52:10.292938240 135.079423 1.355488e+05 1.351576e+05 56.583554 1.905813 7.299346
min 2018-07-01 00:00:00 1.000000 1.000000e+03 1.000000e+00 1.000000 1.000000 1.500000
25% 2018-09-30 00:00:00 70.000000 7.002100e+04 6.760050e+04 28.000000 2.000000 5.400000
50% 2018-12-30 00:00:00 130.000000 1.303570e+05 1.351365e+05 56.000000 2.000000 7.400000
75% 2019-03-31 00:00:00 203.000000 2.030940e+05 2.026998e+05 85.000000 2.000000 9.200000
max 2019-06-30 00:00:00 272.000000 2.373711e+06 2.415841e+06 114.000000 5.000000 29.500000
std NaN 76.784063 8.057990e+04 7.813292e+04 32.826444 0.343436 2.527241
In [19]:
merged.describe(include = [object])
Out[19]:
PROD_NAME LIFESTAGE PREMIUM_CUSTOMER
count 264834 264834 264834
unique 114 7 3
top Kettle Mozzarella Basil & Pesto 175g OLDER SINGLES/COUPLES Mainstream
freq 3304 54479 101988
In [20]:
merged.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 264834 entries, 0 to 264833
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   DATE              264834 non-null  datetime64[ns]
 1   STORE_NBR         264834 non-null  int64         
 2   LYLTY_CARD_NBR    264834 non-null  int64         
 3   TXN_ID            264834 non-null  int64         
 4   PROD_NBR          264834 non-null  int64         
 5   PROD_NAME         264834 non-null  object        
 6   PROD_QTY          264834 non-null  int64         
 7   TOT_SALES         264834 non-null  float64       
 8   LIFESTAGE         264834 non-null  object        
 9   PREMIUM_CUSTOMER  264834 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(5), object(3)
memory usage: 20.2+ MB
In [21]:
# Check if all the products are chips.
non_chips = merged[~merged['PROD_NAME'].str.contains('chip', case = False, na = False)]
print(non_chips['PROD_NAME'].drop_duplicates().head(20))
print("Product Names without Chips(s):", len(non_chips))
1                     CCs Nacho Cheese    175g
4     Kettle Tortilla ChpsHny&Jlpno Chili 150g
5     Old El Paso Salsa   Dip Tomato Mild 300g
7        Grain Waves         Sweet Chilli 210g
9        Grain Waves Sour    Cream&Chives 210G
11       Kettle Sensations   Siracha Lime 150g
12                    Twisties Cheese     270g
13            WW Crinkle Cut      Chicken 175g
15                           CCs Original 175g
16                           Burger Rings 220g
17      NCC Sour Cream &    Garden Chives 175g
19                    Cheezels Cheese Box 125g
20           Smiths Crinkle      Original 330g
22      Infzns Crn Crnchers Tangy Gcamole 110g
23        Kettle Sea Salt     And Vinegar 175g
27                        Kettle Original 175g
28        Red Rock Deli Thai  Chilli&Lime 150g
30            Pringles Sthrn FriedChicken 134g
31                Pringles Sweet&Spcy BBQ 134g
32    Red Rock Deli SR    Salsa & Mzzrlla 150g
Name: PROD_NAME, dtype: object
Product Names without Chips(s): 190264

I couldn't clearly say these products were not chips, so I just left them here because they took a large part of data (190K+)

1.4 Feature Engineering¶

Though we've merged these two datasets, we still need to create some new features to conveniently analyze data later. Here I created 4 new columns: PACK_SIZE(g), BRAND, PRICE, YEAR, YEARMONTH

In [22]:
#According to product name, create a pack-size column.
merged['PACK_SIZE(g)'] = merged['PROD_NAME'].str.extract(r'(?i)(\d+)\s*g')
merged['PACK_SIZE(g)'] = merged['PACK_SIZE(g)'].astype('Int64')
merged.head()
Out[22]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES LIFESTAGE PREMIUM_CUSTOMER PACK_SIZE(g)
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0 YOUNG SINGLES/COUPLES Premium 175
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3 MIDAGE SINGLES/COUPLES Budget 175
2 2019-05-20 1 1343 383 61 Smiths Crinkle Cut Chips Chicken 170g 2 2.9 MIDAGE SINGLES/COUPLES Budget 170
3 2018-08-17 2 2373 974 69 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0 MIDAGE SINGLES/COUPLES Budget 175
4 2018-08-18 2 2426 1038 108 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8 MIDAGE SINGLES/COUPLES Budget 150
In [23]:
merged['PACK_SIZE(g)'].isna().sum() 
Out[23]:
np.int64(0)
In [24]:
merged['PACK_SIZE(g)'].describe()
Out[24]:
count      264834.0
mean     182.425512
std       64.325148
min            70.0
25%           150.0
50%           170.0
75%           175.0
max           380.0
Name: PACK_SIZE(g), dtype: Float64
In [25]:
fig = px.histogram(merged, x = 'PACK_SIZE(g)', nbins=len(merged['PACK_SIZE(g)'].unique()), 
                  title="Distribution of Transactions by Pack Size")
fig.show()
In [26]:
#Extract brand name according to product name's first word
merged['BRAND'] = merged['PROD_NAME'].str.split().str[0]
merged.head()
Out[26]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES LIFESTAGE PREMIUM_CUSTOMER PACK_SIZE(g) BRAND
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0 YOUNG SINGLES/COUPLES Premium 175 Natural
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3 MIDAGE SINGLES/COUPLES Budget 175 CCs
2 2019-05-20 1 1343 383 61 Smiths Crinkle Cut Chips Chicken 170g 2 2.9 MIDAGE SINGLES/COUPLES Budget 170 Smiths
3 2018-08-17 2 2373 974 69 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0 MIDAGE SINGLES/COUPLES Budget 175 Smiths
4 2018-08-18 2 2426 1038 108 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8 MIDAGE SINGLES/COUPLES Budget 150 Kettle
In [27]:
merged['BRAND'].describe(include = [object])
Out[27]:
count     264834
unique        29
top       Kettle
freq       41288
Name: BRAND, dtype: object
In [28]:
# Check unique brands and see if we can combine some. Sometimes different outcomes could be the same brand because of the spelling.
brand_counts = merged['BRAND'].value_counts()
print(brand_counts)
BRAND
Kettle        41288
Smiths        28860
Pringles      25102
Doritos       24962
Thins         14075
RRD           11894
Infuzions     11057
WW            10320
Cobs           9693
Tostitos       9471
Twisties       9454
Old            9324
Tyrrells       6442
Grain          6272
Natural        6050
Red            5885
Cheezels       4603
CCs            4551
Woolworths     4437
Dorito         3183
Infzns         3144
Smith          2963
Cheetos        2927
Snbts          1576
Burger         1564
GrnWves        1468
Sunbites       1432
NCC            1419
French         1418
Name: count, dtype: int64
In [29]:
brand_map = {
    "Dorito": "Doritos",
    "Smith": "Smiths",
    "Red": "RRD",
    "Snbts": "Sunbites"
}
merged['BRAND'] = merged['BRAND'].replace(brand_map)
merged['BRAND'].describe()
Out[29]:
count     264834
unique        25
top       Kettle
freq       41288
Name: BRAND, dtype: object
In [30]:
print(merged['BRAND'].value_counts())
BRAND
Kettle        41288
Smiths        31823
Doritos       28145
Pringles      25102
RRD           17779
Thins         14075
Infuzions     11057
WW            10320
Cobs           9693
Tostitos       9471
Twisties       9454
Old            9324
Tyrrells       6442
Grain          6272
Natural        6050
Cheezels       4603
CCs            4551
Woolworths     4437
Infzns         3144
Sunbites       3008
Cheetos        2927
Burger         1564
GrnWves        1468
NCC            1419
French         1418
Name: count, dtype: int64
In [31]:
merged['PRICE'] = merged['TOT_SALES'] / merged['PROD_QTY']
merged
Out[31]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES LIFESTAGE PREMIUM_CUSTOMER PACK_SIZE(g) BRAND PRICE
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0 YOUNG SINGLES/COUPLES Premium 175 Natural 3.00
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3 MIDAGE SINGLES/COUPLES Budget 175 CCs 2.10
2 2019-05-20 1 1343 383 61 Smiths Crinkle Cut Chips Chicken 170g 2 2.9 MIDAGE SINGLES/COUPLES Budget 170 Smiths 1.45
3 2018-08-17 2 2373 974 69 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0 MIDAGE SINGLES/COUPLES Budget 175 Smiths 3.00
4 2018-08-18 2 2426 1038 108 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8 MIDAGE SINGLES/COUPLES Budget 150 Kettle 4.60
... ... ... ... ... ... ... ... ... ... ... ... ... ...
264829 2019-03-09 272 272319 270088 89 Kettle Sweet Chilli And Sour Cream 175g 2 10.8 YOUNG SINGLES/COUPLES Premium 175 Kettle 5.40
264830 2018-08-13 272 272358 270154 74 Tostitos Splash Of Lime 175g 1 4.4 YOUNG SINGLES/COUPLES Premium 175 Tostitos 4.40
264831 2018-11-06 272 272379 270187 51 Doritos Mexicana 170g 2 8.8 YOUNG SINGLES/COUPLES Premium 170 Doritos 4.40
264832 2018-12-27 272 272379 270188 42 Doritos Corn Chip Mexican Jalapeno 150g 2 7.8 YOUNG SINGLES/COUPLES Premium 150 Doritos 3.90
264833 2018-09-22 272 272380 270189 74 Tostitos Splash Of Lime 175g 2 8.8 YOUNG SINGLES/COUPLES Premium 175 Tostitos 4.40

264834 rows × 13 columns

In [32]:
#Create 'year', 'month', 'day of week' columns to see if there is seasonal influence
merged['YEAR'] = merged['DATE'].dt.year
merged['YEARMONTH'] = merged['DATE'].dt.to_period('M')
merged.head()
Out[32]:
DATE STORE_NBR LYLTY_CARD_NBR TXN_ID PROD_NBR PROD_NAME PROD_QTY TOT_SALES LIFESTAGE PREMIUM_CUSTOMER PACK_SIZE(g) BRAND PRICE YEAR YEARMONTH
0 2018-10-17 1 1000 1 5 Natural Chip Compny SeaSalt175g 2 6.0 YOUNG SINGLES/COUPLES Premium 175 Natural 3.00 2018 2018-10
1 2019-05-14 1 1307 348 66 CCs Nacho Cheese 175g 3 6.3 MIDAGE SINGLES/COUPLES Budget 175 CCs 2.10 2019 2019-05
2 2019-05-20 1 1343 383 61 Smiths Crinkle Cut Chips Chicken 170g 2 2.9 MIDAGE SINGLES/COUPLES Budget 170 Smiths 1.45 2019 2019-05
3 2018-08-17 2 2373 974 69 Smiths Chip Thinly S/Cream&Onion 175g 5 15.0 MIDAGE SINGLES/COUPLES Budget 175 Smiths 3.00 2018 2018-08
4 2018-08-18 2 2426 1038 108 Kettle Tortilla ChpsHny&Jlpno Chili 150g 3 13.8 MIDAGE SINGLES/COUPLES Budget 150 Kettle 4.60 2018 2018-08

2. Customer Behaviour Analysis¶

Here I aggregated customer count by lifestage and premium_customer columns. Then checked total sales, customer counts, average price and average spend of each group.

The weekly transactions over time shows the number of chip transactions has remained relatively consistent over the last 52 weeks; a notable increase occurred in the week leading up to Christmas.

In [33]:
weekly = merged.copy()

weekly['WEEK'] = weekly['DATE'].dt.to_period('W-MON').apply(lambda r: r.start_time)

weekly_plot = (weekly.groupby('WEEK')['TXN_ID']
              .nunique()
              .reset_index(name = 'transactions')
              .sort_values('WEEK')
              .tail(52))

fig_weekly = px.line(
    weekly_plot, x = 'WEEK', y = 'transactions',
    title = 'Chips - Weekly Transactions over Time',
    markers = True
)

fig_weekly.update_traces(line=dict(color='orange'), marker=dict(color='orange'))

fig_weekly.update_layout(
    yaxis_title = 'Number of Transactions',
    xaxis_title = None
)
fig_weekly.show()

2.1 Customer Analysis by Lifestage¶

When users are categorized by lifestage, older singles/couples have the highest overall sales.

But new families have the highest average customer spending of $3.9.

Older families and young families have the highest average spending, at approximately $36.

As older singles/couples contribute the most to the total sales. I did some analytics to deep dive their preferences.

In [34]:
# Customer distribution by lifestage
lifestage_counts = behaviour['LIFESTAGE'].value_counts()
lifestage_counts
Out[34]:
LIFESTAGE
RETIREES                  14805
OLDER SINGLES/COUPLES     14609
YOUNG SINGLES/COUPLES     14441
OLDER FAMILIES             9780
YOUNG FAMILIES             9178
MIDAGE SINGLES/COUPLES     7275
NEW FAMILIES               2549
Name: count, dtype: int64
In [35]:
# Calculate total sales and per sales by customer lifestage and premium status
sales_by_lifestage = merged.groupby(['LIFESTAGE'])['TOT_SALES'].sum().reset_index()
customer_by_lifestage = lifestage_counts.reset_index()
customer_by_lifestage.columns = ['LIFESTAGE', 'CUSTOMER_COUNT']

avg_price_by_lifestage = merged.groupby(['LIFESTAGE'])['PRICE'].mean().reset_index()
avg_price_by_lifestage.columns = ['LIFESTAGE', 'AVG_PRICE']

lifestage_summary = sales_by_lifestage.merge(customer_by_lifestage, on='LIFESTAGE')
lifestage_summary = lifestage_summary.merge(avg_price_by_lifestage, on='LIFESTAGE')

lifestage_summary['AVG_SPEND'] = (lifestage_summary['TOT_SALES'] / lifestage_summary['CUSTOMER_COUNT'])
lifestage_summary = lifestage_summary.sort_values('TOT_SALES', ascending = False)
lifestage_summary
Out[35]:
LIFESTAGE TOT_SALES CUSTOMER_COUNT AVG_PRICE AVG_SPEND
3 OLDER SINGLES/COUPLES 402426.75 14609 3.855725 27.546495
4 RETIREES 366470.90 14805 3.884459 24.753185
2 OLDER FAMILIES 352467.20 9780 3.724983 36.039591
5 YOUNG FAMILIES 316160.10 9178 3.739539 34.447603
6 YOUNG SINGLES/COUPLES 260405.30 14441 3.887723 18.032359
0 MIDAGE SINGLES/COUPLES 184751.30 7275 3.864922 25.395368
1 NEW FAMILIES 50433.45 2549 3.907443 19.785583
In [36]:
# We can see that older singles/couples contribute most to the total sales. Let's see their preferences.
# Pack size distribution
seg = merged.loc[merged['LIFESTAGE'] == 'OLDER SINGLES/COUPLES'].copy()
pack_counts = (seg['PACK_SIZE(g)']
               .value_counts()
               .sort_index()
               .reset_index())
pack_counts.columns = ['PACK_SIZE(g)', 'COUNT']
pack_counts['SHARE'] = pack_counts['COUNT'] / pack_counts['COUNT'].sum()

fig_2_1_1 = px.bar(
    pack_counts,
    x='PACK_SIZE(g)', y='COUNT',
    title='Pack-size Distribution — OLDER SINGLES/COUPLES (Count)',
    text=pack_counts['SHARE'].map(lambda v: f'{v:.1%}')
)
fig_2_1_1.update_layout(xaxis_title = 'Pack Size (g)', yaxis_title = 'Transactions',
                   xaxis=dict(type = 'category'))
fig_2_1_1.update_traces(marker_color = 'orange', textposition = 'outside', cliponaxis = False)
fig_2_1_1.show()
In [37]:
# Brand distribution
brand_counts = (seg['BRAND']
                .dropna()
                .value_counts()
                .reset_index())
brand_counts.columns = ['BRAND', 'COUNT']
brand_counts['SHARE'] = brand_counts['COUNT'] / brand_counts['COUNT'].sum()

topN = 15
brand_top_share = brand_counts.sort_values('SHARE', ascending=False).head(topN)

fig_2_1_2 = px.bar(
    brand_top_share,
    x = 'SHARE', y = 'BRAND',
    orientation = 'h',
    text=brand_top_share['SHARE'].map(lambda v: f'{v:.1%}'),
    title = f'Brand Share — OLDER SINGLES/COUPLES (Top {topN})',
    color_discrete_sequence=['orange']
)
fig_2_1_2.update_yaxes(autorange = 'reversed')
fig_2_1_2.update_xaxes(tickformat = '.0%')
fig_2_1_2.update_layout(xaxis_title = 'Share', yaxis_title = 'Brand')
fig_2_1_2.show()

We could conclude that older singles/couples preferred pack size of 175g and Kettle's chips.

2.2 Customer Analysis by Premium Class¶

Mainstream customers contribute the most to the total sales and customer counts. Their average spending price is also the highest, at $3.86. However, customers with budget have the highest average spending.

As mainstream customers have the highest average spending price, I did t-test to see if there are significant difference between mainstream customers and other customers. Outcome shows there is significant difference in avg price.

In [38]:
# Customer distribution by premium class
premium_counts = behaviour['PREMIUM_CUSTOMER'].value_counts()
premium_counts
Out[38]:
PREMIUM_CUSTOMER
Mainstream    29245
Budget        24470
Premium       18922
Name: count, dtype: int64
In [39]:
sales_by_premium = merged.groupby(['PREMIUM_CUSTOMER'])['TOT_SALES'].sum().reset_index()
customer_by_premium = premium_counts.reset_index()
customer_by_premium.columns = ['PREMIUM_CUSTOMER', 'CUSTOMER_COUNT']

avg_price_by_premium = merged.groupby(['PREMIUM_CUSTOMER'])['PRICE'].mean().reset_index()
avg_price_by_premium.columns = ['PREMIUM_CUSTOMER', 'AVG_PRICE']

premium_summary = sales_by_premium.merge(customer_by_premium, on='PREMIUM_CUSTOMER')
premium_summary = premium_summary.merge(avg_price_by_premium, on='PREMIUM_CUSTOMER')

premium_summary['AVG_SPEND'] = (premium_summary['TOT_SALES'] / premium_summary['CUSTOMER_COUNT'])
premium_summary = premium_summary.sort_values('TOT_SALES', ascending = False)
premium_summary
Out[39]:
PREMIUM_CUSTOMER TOT_SALES CUSTOMER_COUNT AVG_PRICE AVG_SPEND
1 Mainstream 750744.50 29245 3.867268 25.670867
0 Budget 676211.55 24470 3.793403 27.634309
2 Premium 506158.95 18922 3.803969 26.749760

Mainstream customers' total sales is the highest as they take the most part of total customers. However, customers with budget have the highest average spending.

In [40]:
sales_by_segment = (merged
                    .groupby(['LIFESTAGE', 'PREMIUM_CUSTOMER'])['TOT_SALES']
                    .sum()
                    .reset_index())
fig_2_2_1 = px.bar(sales_by_segment, x = 'LIFESTAGE', y = 'TOT_SALES', 
            color = 'PREMIUM_CUSTOMER',
            title = 'Total Sales by Customer Lifestage and Premium Status',
            barmode = 'group',
            color_discrete_map={
                'Budget': 'lightblue',
                'Mainstream': 'orange',
                'Premium': '#1f77b4'}
                  )
fig_2_2_1.show()

Families behave similarly, singles behave similarly (Except for older singles/couples)

In [41]:
# T-test to see if there was a signicant difference between mainsteam's avg price and other customers' avg price
# t-test assumptions: sample independence, countinuous variable, normal distribution, homogeneity of variance, proper sample size
mainstream_price = merged.loc[merged['PREMIUM_CUSTOMER'] == 'Mainstream', 'PRICE']
other_price = merged.loc[merged['PREMIUM_CUSTOMER'] != 'Mainstream', 'PRICE']

t_stat, p_value = stats.ttest_ind(mainstream_price, other_price, equal_var = False)
print("t-statistic:", t_stat)
print("p-value:", p_value)
t-statistic: 15.723599554376927
p-value: 1.1180188026786365e-55

p-value < 0.05, so we declined H0, which meant there was significant avg price difference between mainstream customers and other customers.

In [42]:
# Pack size distribution
seg_prem = merged.loc[merged['PREMIUM_CUSTOMER'] == 'Mainstream'].copy()
pack_counts_prem = (seg_prem['PACK_SIZE(g)']
               .value_counts(normalize = True)
               .sort_index()
               .reset_index())
pack_counts_prem.columns = ['PACK_SIZE(g)', 'SHARE']
pack_counts_prem['GROUP'] = 'Mainstream'

seg_other = merged.loc[merged['PREMIUM_CUSTOMER'] != 'Mainstream'].copy()
pack_counts_other = seg_other['PACK_SIZE(g)'].value_counts(normalize = True).sort_index().reset_index()
pack_counts_other.columns = ['PACK_SIZE(g)', 'SHARE']
pack_counts_other['GROUP'] = 'Others'

#Combine these 2 charts
pack_compare = pd.concat([pack_counts_prem, pack_counts_other])

fig_2_2_2 = px.bar(
    pack_compare,
    x='PACK_SIZE(g)', y='SHARE',
    color = 'GROUP', barmode = 'group',
    text=pack_compare['SHARE'].map(lambda v: f'{v:.1%}'),
    title='Pack-size Distribution — Mainstream VS Others (Share)',
    color_discrete_sequence=['orange', 'lightblue']
)
fig_2_2_2.update_layout(xaxis_title = 'Pack Size (g)', 
                        yaxis_title = 'Transactions',
                        xaxis=dict(type = 'category'),
                        yaxis_tickformat='.0%')
fig_2_2_2.update_traces(textposition = 'outside', cliponaxis = False)
fig_2_2_2.show()
In [43]:
# Brand distribution
brand_prem = (seg_prem['BRAND']
               .value_counts(normalize = True)
               .sort_index()
               .reset_index())
brand_prem.columns = ['BRAND', 'SHARE']
brand_prem['GROUP'] = 'Mainstream'

brand_other = seg_other['BRAND'].value_counts(normalize = True).sort_index().reset_index()
brand_other.columns = ['BRAND', 'SHARE']
brand_other['GROUP'] = 'Others'

#Combine these 2 charts
brand_compare = pd.concat([brand_prem, brand_other])

#Choose top 8 brands
topN = 12
brand_top_compare = brand_compare.sort_values('SHARE', ascending=False).head(topN)

fig_2_2_3 = px.bar(
    brand_top_compare,
    x='BRAND', y='SHARE',
    color = 'GROUP', barmode = 'group',
    text=brand_top_compare['SHARE'].map(lambda v: f'{v:.1%}'),
    title='Brand Distribution — Mainstream VS Others (Share)',
    color_discrete_sequence=['orange', 'lightblue']
)
fig_2_2_3.update_layout(xaxis_title = 'Brand', 
                        yaxis_title = 'Transactions',
                        xaxis=dict(type = 'category'),
                        yaxis_tickformat='.0%')
fig_2_2_3.update_traces(textposition = 'outside', cliponaxis = False)
fig_2_2_3.show()

From charts above, we could see mainstream customers prefer smaller package and Kettle and Doritos.

3. Experimentation and Uplift Testing¶

In this part, I choose 3 stores as trial stores: store 77, 86, and 88.

2018 is my pre-trial period and 2019 is my trial period.

My goal is to choose 3 control stores which are very similar to trial stores according to data of 2018. And use data of 2019 to see if there is significant difference between trial stores and control stores in 2019.

Here 'similar' means they're similar in Sales and Customers. These 2 metrics are something I want to focus on.

3.1 Generate Monthly Chart with Key Metrics¶

Aggregate store data by YEARMONTH column

In [44]:
monthly = (
    merged.groupby(['STORE_NBR','YEARMONTH'])
          .agg(
              totSales=('TOT_SALES','sum'),                # total sales
              nCustomers=('LYLTY_CARD_NBR','nunique'),     # total customers
              nTxns=('TXN_ID','nunique'),                  # total transactions
              units=('PROD_QTY','sum')                     # total quantities
          )
          .reset_index()
)

# other key metrics
monthly['nTxnPerCust']    = monthly['nTxns'] / monthly['nCustomers'] #avg transactions per customer
monthly['nChipsPerTxn']   = monthly['units'] / monthly['nTxns'] #avg quantity per transaction
monthly['avgPricePerUnit']= monthly['totSales'] / monthly['units'] #avg transaction price

monthly.head(10)
Out[44]:
STORE_NBR YEARMONTH totSales nCustomers nTxns units nTxnPerCust nChipsPerTxn avgPricePerUnit
0 1 2018-07 206.9 49 52 62 1.061224 1.192308 3.337097
1 1 2018-08 176.1 42 43 54 1.023810 1.255814 3.261111
2 1 2018-09 278.8 59 62 75 1.050847 1.209677 3.717333
3 1 2018-10 188.1 44 45 58 1.022727 1.288889 3.243103
4 1 2018-11 192.6 46 47 57 1.021739 1.212766 3.378947
5 1 2018-12 189.6 42 47 57 1.119048 1.212766 3.326316
6 1 2019-01 154.8 35 36 42 1.028571 1.166667 3.685714
7 1 2019-02 225.4 52 55 65 1.057692 1.181818 3.467692
8 1 2019-03 192.9 45 49 58 1.088889 1.183673 3.325862
9 1 2019-04 192.9 42 43 57 1.023810 1.325581 3.384211

3.2 Calculate Pearson Correlation to Find Stores with Similar Trends¶

In [45]:
# Define a function to calculate Pearson Correlation of Sales and Customers
def calculateCorrelation(inputTable: pd.DataFrame,
                         metricCol: str,
                         storeComparison: int,
                         pre_year: int = 2019, # Set 2018 as pre-trial year. So year(YEARMONTH) needs to < 2019.
                         min_periods: int = 3) -> pd.DataFrame:
    """
    On the pre-test (year < pre_year), calculate the Pearson correlation coefficient between the test store's 
    storeComparison and the other stores in metricCol.
    Correlation is only calculated when the overlapping months are greater than or equal to min_periods 
    and the std is greater than 0.
    Return columns: Store1, Store2, corr_measure
    """
    df = inputTable.copy()

    # Only use data in pre-trial period
    df_pre = df[df['YEARMONTH'].dt.year < pre_year]

    # Store x YEARMONTH matrix
    wide = df_pre.pivot(index='STORE_NBR', columns='YEARMONTH', values=metricCol).sort_index(axis=1)

    if storeComparison not in wide.index:
        return pd.DataFrame(columns=['Store1','Store2','corr_measure'])

    trial = wide.loc[storeComparison]

    def safe_corr(s):
        # Combine 2 charts using YEARMONTH
        pair = pd.concat([trial, s], axis=1, join='inner').dropna()
        if len(pair) < min_periods:
            return np.nan
        x, y = pair.iloc[:,0], pair.iloc[:,1]
        # Correlation is null when variance is 0
        if x.std(ddof=1) == 0 or y.std(ddof=1) == 0:
            return np.nan
        return x.corr(y)  # Pearson

    corr = wide.apply(safe_corr, axis=1)

    # Remove itself and NA
    corr = corr.drop(index=storeComparison, errors='ignore').dropna()

    out = (corr.rename('corr_measure')
               .reset_index()
               .rename(columns={'STORE_NBR':'Store2'}))
    out.insert(0, 'Store1', storeComparison)
    out = out.sort_values('corr_measure', ascending=False, ignore_index=True)
    return out


corr_sales_77 = calculateCorrelation(monthly, 'totSales', storeComparison=77)
corr_cust_77  = calculateCorrelation(monthly, 'nCustomers', storeComparison=77)

print("📊 Correlation of Sales (Store 77 vs others)")
display(corr_sales_77.head())
print("📊 Correlation of Customers (Store 77 vs others)")
display(corr_cust_77.head())
📊 Correlation of Sales (Store 77 vs others)
Store1 Store2 corr_measure
0 77 71 0.944303
1 77 63 0.932288
2 77 119 0.885916
3 77 233 0.869930
4 77 3 0.853302
📊 Correlation of Customers (Store 77 vs others)
Store1 Store2 corr_measure
0 77 233 0.991585
1 77 119 0.980324
2 77 254 0.961820
3 77 71 0.896644
4 77 48 0.890898
In [46]:
corr_sales_86 = calculateCorrelation(monthly, 'totSales', storeComparison=86)
corr_cust_86 = calculateCorrelation(monthly, 'nCustomers', storeComparison=86)

print("📊 Correlation of Sales (Store 86 vs others)")
display(corr_sales_86.head())
print("📊 Correlation of Customers (Store 86 vs others)")
display(corr_cust_86.head())
📊 Correlation of Sales (Store 86 vs others)
Store1 Store2 corr_measure
0 86 178 0.908254
1 86 22 0.886097
2 86 155 0.876103
3 86 138 0.850964
4 86 132 0.837883
📊 Correlation of Customers (Store 86 vs others)
Store1 Store2 corr_measure
0 86 155 0.954810
1 86 147 0.946477
2 86 114 0.821730
3 86 260 0.798744
4 86 138 0.794732
In [47]:
corr_sales_88 = calculateCorrelation(monthly, 'totSales', storeComparison=88)
corr_cust_88 = calculateCorrelation(monthly, 'nCustomers', storeComparison=88)

print("📊 Correlation of Sales (Store 88 vs others)")
display(corr_sales_88.head())
print("📊 Correlation of Customers (Store 88 vs others)")
display(corr_cust_88.head())
📊 Correlation of Sales (Store 88 vs others)
Store1 Store2 corr_measure
0 88 186 0.905930
1 88 61 0.887826
2 88 159 0.885087
3 88 204 0.838289
4 88 91 0.824997
📊 Correlation of Customers (Store 88 vs others)
Store1 Store2 corr_measure
0 88 69 0.937817
1 88 237 0.901321
2 88 265 0.883245
3 88 178 0.882568
4 88 191 0.869272

3.3 Calculate Magnitude Distance to Find Stores with Similar Sizes¶

In [48]:
# Define a function to calcluate magnitude distance between trial stores and control stores.
def calculateMagnitudeDistance(inputTable: pd.DataFrame,
                               metricCol: str,
                               storeComparison: int,
                               pre_year: int = 2019,
                               min_overlap: int = 3) -> pd.DataFrame:
    """
    On the pre-trial (year < pre_year), calculate the "magnitude proximity" of the trial store to other stores 
    in metricCol.
    Steps:
    1) Store × month matrix (retain only the pre-trial)
    2) Align with the trial sequence and calculate the absolute difference month by month
    3) Min–max normalization
    4) Take the mean -> mag_measure ∈ [0,1], where a larger value indicates similarity
    Parameters:
    - min_overlap: The minimum number of months in common for comparison
    Returns:
    Store1, Store2, mag_measure (sorted by descending mag_measure)
    """
    df = inputTable.copy()

    df_pre = df[df['YEARMONTH'].dt.year < pre_year]

    # Stores x Month Matrix
    wide = df_pre.pivot(index='STORE_NBR', columns='YEARMONTH', values=metricCol).sort_index(axis=1)

    # There must be a trial store
    if storeComparison not in wide.index:
        return pd.DataFrame(columns=['Store1','Store2','mag_measure'])

    trial = wide.loc[storeComparison]

    # Calculate the abs
    diff = (wide.sub(trial, axis=1)).abs()

    # Min–max normalization
    mag = diff.apply(
        lambda col: 1 - (col - col.min()) / (col.max() - col.min()) if col.max()!=col.min() else 1,
        axis=0
    )

    # Calculate mean of each store
    mag_measure = mag.mean(axis=1, skipna=True)

    # Drop trial store itself
    mag_measure = mag_measure.drop(index=storeComparison, errors='ignore')

    out = (mag_measure.rename('mag_measure')
           .reset_index()
           .rename(columns={'STORE_NBR':'Store2'}))
    out.insert(0, 'Store1', storeComparison)
    out = out.sort_values('mag_measure', ascending=False, ignore_index=True)
    return out

mag_sales_77 = calculateMagnitudeDistance(monthly, 'totSales', storeComparison=77, pre_year=2019, min_overlap=3)
mag_cust_77  = calculateMagnitudeDistance(monthly, 'nCustomers', storeComparison=77, pre_year=2019, min_overlap=3)

print("📊 Magnitude of Sales (Store 77 vs others)")
display(mag_sales_77.head())

print("📊 Magnitude of Customers (Store 77 vs others)")
display(mag_cust_77.head())
📊 Magnitude of Sales (Store 77 vs others)
Store1 Store2 mag_measure
0 77 233 0.986204
1 77 255 0.976287
2 77 53 0.975302
3 77 141 0.975227
4 77 205 0.974286
📊 Magnitude of Customers (Store 77 vs others)
Store1 Store2 mag_measure
0 77 233 0.991569
1 77 115 0.973703
2 77 41 0.972096
3 77 111 0.968826
4 77 17 0.962979
In [49]:
mag_sales_86 = calculateMagnitudeDistance(monthly, 'totSales', storeComparison=86, pre_year=2019, min_overlap=3)
mag_cust_86 = calculateMagnitudeDistance(monthly, 'nCustomers', storeComparison=86, pre_year=2019, min_overlap=3)

print("📊 Magnitude of Sales (Store 86 vs others)")
display(mag_sales_86.head())

print("📊 Magnitude of Customers (Store 86 vs others)")
display(mag_cust_86.head())
📊 Magnitude of Sales (Store 86 vs others)
Store1 Store2 mag_measure
0 86 222 0.967912
1 86 155 0.963593
2 86 109 0.959268
3 86 225 0.949984
4 86 160 0.948399
📊 Magnitude of Customers (Store 86 vs others)
Store1 Store2 mag_measure
0 86 155 0.986277
1 86 227 0.973451
2 86 109 0.964175
3 86 225 0.962162
4 86 247 0.960127
In [50]:
mag_sales_88 = calculateMagnitudeDistance(monthly, 'totSales', storeComparison=88, pre_year=2019, min_overlap=3)
mag_cust_88 = calculateMagnitudeDistance(monthly, 'nCustomers', storeComparison=88, pre_year=2019, min_overlap=3)

print("📊 Magnitude of Sales (Store 88 vs others)")
display(mag_sales_88.head())

print("📊 Magnitude of Customers (Store 88 vs others)")
display(mag_cust_88.head())
📊 Magnitude of Sales (Store 88 vs others)
Store1 Store2 mag_measure
0 88 237 0.955056
1 88 203 0.951909
2 88 40 0.930311
3 88 165 0.921560
4 88 199 0.915405
📊 Magnitude of Customers (Store 88 vs others)
Store1 Store2 mag_measure
0 88 237 0.985609
1 88 165 0.945211
2 88 203 0.938447
3 88 40 0.936907
4 88 4 0.936829

3.4 Decide 3 Control Stores Which are the Most Similar to Store 77, 86, and 88¶

In [51]:
# Define a function to calculate the final score using Pearson Correlation and Magnitude Distance
def build_final_scores_for_store(monthly, trial_store, pre_year=2019, corr_weight=0.5):
    """
    Combine scoreNSales, scoreNCust -> finalControlScore (the same weight)
    """
    # ---- scoreNSales:corr + mag ----
    corr_sales = calculateCorrelation(monthly, 'totSales', trial_store, pre_year=pre_year)
    mag_sales  = calculateMagnitudeDistance(monthly, 'totSales', trial_store, pre_year=pre_year)

    # corr maps to 0-1
    corr_sales['corr_sales'] = (corr_sales['corr_measure'] + 1) / 2
    sales_scores = corr_sales[['Store2','corr_sales']].merge(
        mag_sales[['Store2','mag_measure']].rename(columns={'mag_measure':'mag_sales'}),
        on='Store2', how='inner'
    )
    sales_scores['scoreNSales'] = corr_weight * sales_scores['corr_sales'] + (1 - corr_weight) * sales_scores['mag_sales']

    # ----scoreNCust:corr + mag ----
    corr_cust = calculateCorrelation(monthly, 'nCustomers', trial_store, pre_year=pre_year)
    mag_cust  = calculateMagnitudeDistance(monthly, 'nCustomers', trial_store, pre_year=pre_year)

    corr_cust['corr_cust'] = (corr_cust['corr_measure'] + 1) / 2
    cust_scores = corr_cust[['Store2','corr_cust']].merge(
        mag_cust[['Store2','mag_measure']].rename(columns={'mag_measure':'mag_cust'}),
        on='Store2', how='inner'
    )
    cust_scores['scoreNCust'] = corr_weight * cust_scores['corr_cust'] + (1 - corr_weight) * cust_scores['mag_cust']

    # ---- Combine the 2 scores and calculate the final score ----
    final_tbl = sales_scores.merge(cust_scores, on='Store2', how='inner')
    final_tbl['finalControlScore'] = final_tbl[['scoreNSales','scoreNCust']].mean(axis=1)

    # Arrange columns & sorting
    final_tbl.insert(0, 'Store1', trial_store)
    final_tbl = final_tbl[
        ['Store1','Store2',
         'corr_sales','mag_sales','scoreNSales',
         'corr_cust','mag_cust','scoreNCust',
         'finalControlScore']
    ].sort_values('finalControlScore', ascending=False, ignore_index=True)

    return final_tbl


final_scores_77 = build_final_scores_for_store(monthly, trial_store=77, pre_year=2019, corr_weight=0.5)

print('Final Score of store 77')
display(final_scores_77.head(5))
Final Score of store 77
Store1 Store2 corr_sales mag_sales scoreNSales corr_cust mag_cust scoreNCust finalControlScore
0 77 233 0.934965 0.986204 0.960584 0.995792 0.991569 0.993681 0.977133
1 77 41 0.855058 0.963798 0.909428 0.888857 0.972096 0.930476 0.919952
2 77 115 0.871641 0.936227 0.903934 0.876298 0.973703 0.925001 0.914467
3 77 17 0.924643 0.881021 0.902832 0.861972 0.962979 0.912475 0.907654
4 77 254 0.772443 0.919807 0.846125 0.980910 0.931704 0.956307 0.901216
In [52]:
final_scores_86 = build_final_scores_for_store(monthly, trial_store=86, pre_year=2019, corr_weight=0.5)

print('Final Score of store 86')
display(final_scores_86.head(5))
Final Score of store 86
Store1 Store2 corr_sales mag_sales scoreNSales corr_cust mag_cust scoreNCust finalControlScore
0 86 155 0.938052 0.963593 0.950822 0.977405 0.986277 0.981841 0.966332
1 86 138 0.925482 0.933794 0.929638 0.897366 0.927431 0.912398 0.921018
2 86 109 0.887606 0.959268 0.923437 0.851064 0.964175 0.907620 0.915528
3 86 114 0.867770 0.923488 0.895629 0.910865 0.936148 0.923506 0.909568
4 86 147 0.814965 0.899335 0.857150 0.973238 0.904519 0.938879 0.898014
In [53]:
final_scores_88 = build_final_scores_for_store(monthly, trial_store=88, pre_year=2019, corr_weight=0.5)

print('Final Score of store 88')
display(final_scores_88.head(5))
Final Score of store 88
Store1 Store2 corr_sales mag_sales scoreNSales corr_cust mag_cust scoreNCust finalControlScore
0 88 238 0.770645 0.856352 0.813499 0.794225 0.892556 0.843390 0.828445
1 88 237 0.392804 0.955056 0.673930 0.950660 0.985609 0.968135 0.821032
2 88 178 0.738132 0.702536 0.720334 0.941284 0.824132 0.882708 0.801521
3 88 69 0.610566 0.714853 0.662710 0.968909 0.869420 0.919164 0.790937
4 88 201 0.710564 0.869780 0.790172 0.600401 0.922343 0.761372 0.775772

Here we choose:

store 233 as the control store to store 77

store 155 as the control store to store 86

store 238 as the control store to store 88

3.5 Visualization Before Trial -?怎么样一行展示两张图¶

In [54]:
# Define a creating chart function
def plot_pretrial_trends(monthly, trial_store, control_store, pre_year=2019):
    """
    Visualize trial store vs control store vs other stores performance in pre-trial
    return:totSales & nCustomers chart
    """

    monthly['TransactionMonth'] = monthly['YEARMONTH'].dt.to_timestamp()

    # only pre-trial
    df = monthly[monthly['TransactionMonth'].dt.year < pre_year].copy()

    # Tag store type:Trial / Control / Other stores
    df['Store_type'] = np.where(df['STORE_NBR'] == trial_store, 'Trial Store',
                         np.where(df['STORE_NBR'] == control_store, 'Control Store', 'Other stores'))

    # Calculate mean of total sales and customers by month and store type
    sales_plot_df = (df.groupby(['TransactionMonth','Store_type'])['totSales']
                       .mean().reset_index())
    cust_plot_df  = (df.groupby(['TransactionMonth','Store_type'])['nCustomers']
                       .mean().reset_index())

    # Color map
    color_map = {
        'Control Store': 'lightblue', 
        'Trial Store':   'orange', 
        'Other stores':  '#7F8C8D'  
    }
    
    # Create subplots
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Sales Trend","Customer Trend"))

    # Sales chart
    for store in sales_plot_df['Store_type'].unique():
        df_temp = sales_plot_df[sales_plot_df['Store_type']==store]
        fig.add_trace(
            go.Scatter(x=df_temp['TransactionMonth'], y=df_temp['totSales'],
                       mode='lines+markers', name = store,
                       line=dict(color=color_map[store])),
            row=1, col=1
        )

    # Customer chart
    for store in cust_plot_df['Store_type'].unique():
        df_temp = cust_plot_df[cust_plot_df['Store_type']==store]
        fig.add_trace(
            go.Scatter(x=df_temp['TransactionMonth'], y=df_temp['nCustomers'],
                       mode='lines+markers', name = store, showlegend = False,
                       line=dict(color=color_map[store])),
            row=1, col=2
        )

    fig.update_layout(title_text=f"Pre-trial Trends — Trial {trial_store} vs Control {control_store}",
                      legend=dict(orientation="h", yanchor="bottom", y=1.05, xanchor="center", x=0.5))

    return fig

fig77 = plot_pretrial_trends(monthly, trial_store=77, control_store=233)
fig77.show()
In [55]:
fig86 = plot_pretrial_trends(monthly, trial_store=86, control_store=155)
fig86.show()
In [56]:
fig88 = plot_pretrial_trends(monthly, trial_store=88, control_store=238)
fig88.show()

3.6 Hypothesis testing: If There are Significant Differences Between Trial Stores and Control Stores¶

3.6.1 Sales Percentage Differences¶

In [57]:
# Calculate scale factor: minimize scale residuals
def get_scaling_factor(monthly, trial_store, control_store, metric_col, pre_year=2019):
    monthly['TransactionMonth'] = monthly['YEARMONTH'].dt.to_timestamp()

    # Only need pre-trial
    pre_trial = monthly[monthly['TransactionMonth'].dt.year < pre_year].copy()

    # trial & control stores' total sales in pre-trial
    trial_pre_total   = pre_trial.loc[pre_trial['STORE_NBR']==trial_store, metric_col].sum()
    control_pre_total = pre_trial.loc[pre_trial['STORE_NBR']==control_store, metric_col].sum()

    # calculate scale factor
    scale = trial_pre_total / control_pre_total if control_pre_total != 0 else np.nan
    return scale

scale_77vs233 = get_scaling_factor(monthly, trial_store=77, control_store=233, metric_col = 'totSales')
print("Scaling factor (77 vs 233):", scale_77vs233)
scale_86vs155 = get_scaling_factor(monthly, trial_store=86, control_store=155, metric_col = 'totSales')
print("Scaling factor (86 vs 155):", scale_86vs155)
scale_88vs238 = get_scaling_factor(monthly, trial_store=88, control_store=238, metric_col = 'totSales')
print("Scaling factor (88 vs 238):", scale_88vs238)
Scaling factor (77 vs 233): 1.0082979154017406
Scaling factor (86 vs 155): 0.9713568024143832
Scaling factor (88 vs 238): 1.1669182444185682
In [58]:
# Calculate percentageDiff using trial store data and (control store data * scale factor) in 2019-01 ~ 2019-06
def percentage_diff(monthly: pd.DataFrame, 
                    trial_store: int,
                    control_store: int,
                    metric_col):
    df = monthly.copy()
    df['TransactionMonth'] = df['YEARMONTH'].dt.to_timestamp()

    scale = get_scaling_factor(monthly, trial_store, control_store, metric_col, pre_year=2019)

    # Only need trial store and control store
    sub = df[df['STORE_NBR'].isin([trial_store, control_store])].copy()
    sub['scaled'] = sub[metric_col].astype(float)
    sub.loc[sub['STORE_NBR'] == control_store, 'scaled'] *= scale

    # Only need data in 2019-01 and 2019-06
    mask = (sub['TransactionMonth'] >= '2019-01') & (sub['TransactionMonth'] <= '2019-06')
    tp = sub[mask]

    trial_series = tp[tp['STORE_NBR'] == trial_store][['TransactionMonth', metric_col]].rename(columns = {metric_col: 'trial'})
    control_series = tp[tp['STORE_NBR'] == control_store][['TransactionMonth', 'scaled']].rename(columns = {'scaled': 'control_scaled'})

    # Combine trial and control using YEARMONTH
    out = trial_series.merge(control_series, on = 'TransactionMonth', how = 'inner')
    out['scaling_factor'] = scale

    # Calculate percentage difference abs
    out['percentageDiff'] = (out['trial'] - out['control_scaled']).abs() / out['control_scaled']

    return out.sort_values('TransactionMonth').reset_index(drop = True)

print('PercentageDiff of sales between trial store 77 and control store 233:')
percent_sales_77vs233 = percentage_diff(monthly, trial_store = 77, control_store = 233, metric_col = 'totSales')
percent_sales_77vs233
PercentageDiff of sales between trial store 77 and control store 233:
Out[58]:
TransactionMonth trial control_scaled scaling_factor percentageDiff
0 2019-01-01 204.4 178.972880 1.008298 0.142072
1 2019-02-01 235.0 246.024691 1.008298 0.044811
2 2019-03-01 278.5 200.752115 1.008298 0.387283
3 2019-04-01 263.5 159.916049 1.008298 0.647740
4 2019-05-01 299.3 347.257802 1.008298 0.138104
5 2019-06-01 264.7 222.833839 1.008298 0.187881
In [59]:
print('PercentageDiff of sales between trial store 86 and control store 155:')
percent_sales_86vs155 = percentage_diff(monthly, trial_store = 86, control_store = 155, metric_col = 'totSales')
percent_sales_86vs155
PercentageDiff of sales between trial store 86 and control store 155:
Out[59]:
TransactionMonth trial control_scaled scaling_factor percentageDiff
0 2019-01-01 841.4 849.548659 0.971357 0.009592
1 2019-02-01 913.2 865.673182 0.971357 0.054902
2 2019-03-01 1026.8 781.359412 0.971357 0.314120
3 2019-04-01 848.2 820.407955 0.971357 0.033876
4 2019-05-01 889.3 896.416625 0.971357 0.007939
5 2019-06-01 838.0 832.647051 0.971357 0.006429
In [60]:
print('PercentageDiff of sales between trial store 88 and control store 238:')
percent_sales_88vs238 = percentage_diff(monthly, trial_store = 88, control_store = 238, metric_col = 'totSales')
percent_sales_88vs238
PercentageDiff of sales between trial store 88 and control store 238:
Out[60]:
TransactionMonth trial control_scaled scaling_factor percentageDiff
0 2019-01-01 1266.40 1489.104372 1.166918 0.149556
1 2019-02-01 1370.20 1188.389540 1.166918 0.152989
2 2019-03-01 1477.20 1108.455640 1.166918 0.332665
3 2019-04-01 1439.40 1379.297365 1.166918 0.043575
4 2019-05-01 1308.25 1465.299240 1.166918 0.107179
5 2019-06-01 1354.60 1248.835905 1.166918 0.084690

3.6.2 Sales - T-test¶

In [61]:
# Define a function to apply t-test and return t_stat, significance
def t_test(monthly, trial_store, control_store, metric_col):
    scale = get_scaling_factor(monthly, trial_store, control_store, metric_col, pre_year=2019)

    # calculate base using pre-trial percentageDiff
    df = monthly.copy()
    df['TransactionMonth'] = df['YEARMONTH'].dt.to_timestamp() 
    sub = df[df['STORE_NBR'].isin([trial_store, control_store])].copy()
    sub['scaled'] = sub[metric_col].astype(float)
    sub.loc[sub['STORE_NBR']==control_store, 'scaled'] *= scale

    pre = sub[sub['TransactionMonth'] < '2019-01-01'] \
            .pivot(index='TransactionMonth', columns='STORE_NBR', values=['scaled', metric_col])

    pre_diff = (pre[(metric_col, trial_store)] - pre[('scaled', control_store)]).abs() / pre[('scaled', control_store)]
    pre_diff = pre_diff.dropna()

    mu = pre_diff.mean()
    s  = pre_diff.std(ddof=1)
    n  = len(pre_diff)
    dfree = n - 1
    tcrit = t.ppf(0.975, dfree) if dfree > 0 else np.nan

    #percentageDiff of trial period
    out = percentage_diff(monthly, trial_store, control_store, metric_col)

    # calculate t-value and significance
    if s == 0 or np.isnan(s):
        out['t_stat'] = np.nan
        out['significant_95pct'] = False
    else:
        out['t_stat'] = (out['percentageDiff'] - mu) / s
        out['significant_95pct'] = out['t_stat'].abs() > tcrit

    # significant band information
    out['lower_95_band'] = mu - tcrit * s
    out['upper_95_band'] = mu + tcrit * s
    out['pre_mean'] = mu
    out['pre_sd'] = s
    out['df'] = dfree
    out['tcrit_95'] = tcrit
    out['scaling_factor'] = scale
    return out

test_77vs233 = t_test(monthly, trial_store=77, control_store=233, metric_col='totSales')
test_77vs233
Out[61]:
TransactionMonth trial control_scaled scaling_factor percentageDiff t_stat significant_95pct lower_95_band upper_95_band pre_mean pre_sd df tcrit_95
0 2019-01-01 204.4 178.972880 1.008298 0.142072 1.266146 False -0.064364 0.212258 0.073947 0.053805 5 2.570582
1 2019-02-01 235.0 246.024691 1.008298 0.044811 -0.541503 False -0.064364 0.212258 0.073947 0.053805 5 2.570582
2 2019-03-01 278.5 200.752115 1.008298 0.387283 5.823511 True -0.064364 0.212258 0.073947 0.053805 5 2.570582
3 2019-04-01 263.5 159.916049 1.008298 0.647740 10.664230 True -0.064364 0.212258 0.073947 0.053805 5 2.570582
4 2019-05-01 299.3 347.257802 1.008298 0.138104 1.192396 False -0.064364 0.212258 0.073947 0.053805 5 2.570582
5 2019-06-01 264.7 222.833839 1.008298 0.187881 2.117514 False -0.064364 0.212258 0.073947 0.053805 5 2.570582
In [62]:
test_86vs155 = t_test(monthly, trial_store=86, control_store=155, metric_col='totSales')
test_86vs155
Out[62]:
TransactionMonth trial control_scaled scaling_factor percentageDiff t_stat significant_95pct lower_95_band upper_95_band pre_mean pre_sd df tcrit_95
0 2019-01-01 841.4 849.548659 0.971357 0.009592 -0.664567 False -0.042692 0.098335 0.027821 0.027431 5 2.570582
1 2019-02-01 913.2 865.673182 0.971357 0.054902 0.987208 False -0.042692 0.098335 0.027821 0.027431 5 2.570582
2 2019-03-01 1026.8 781.359412 0.971357 0.314120 10.437046 True -0.042692 0.098335 0.027821 0.027431 5 2.570582
3 2019-04-01 848.2 820.407955 0.971357 0.033876 0.220714 False -0.042692 0.098335 0.027821 0.027431 5 2.570582
4 2019-05-01 889.3 896.416625 0.971357 0.007939 -0.724819 False -0.042692 0.098335 0.027821 0.027431 5 2.570582
5 2019-06-01 838.0 832.647051 0.971357 0.006429 -0.779871 False -0.042692 0.098335 0.027821 0.027431 5 2.570582
In [63]:
test_88vs238 = t_test(monthly, trial_store=88, control_store=238, metric_col='totSales')
test_88vs238
Out[63]:
TransactionMonth trial control_scaled scaling_factor percentageDiff t_stat significant_95pct lower_95_band upper_95_band pre_mean pre_sd df tcrit_95
0 2019-01-01 1266.40 1489.104372 1.166918 0.149556 2.922470 True -0.039445 0.137448 0.049002 0.034407 5 2.570582
1 2019-02-01 1370.20 1188.389540 1.166918 0.152989 3.022246 True -0.039445 0.137448 0.049002 0.034407 5 2.570582
2 2019-03-01 1477.20 1108.455640 1.166918 0.332665 8.244280 True -0.039445 0.137448 0.049002 0.034407 5 2.570582
3 2019-04-01 1439.40 1379.297365 1.166918 0.043575 -0.157724 False -0.039445 0.137448 0.049002 0.034407 5 2.570582
4 2019-05-01 1308.25 1465.299240 1.166918 0.107179 1.690842 False -0.039445 0.137448 0.049002 0.034407 5 2.570582
5 2019-06-01 1354.60 1248.835905 1.166918 0.084690 1.037236 False -0.039445 0.137448 0.049002 0.034407 5 2.570582

3.6.3 Customers Percentage Differences¶

Here we can directly use get_scaling_factor function and percentage_diff function.

In [64]:
# Calculate scaling factor of customers
scale_cust_77vs233 = get_scaling_factor(monthly, trial_store=77, control_store=233, metric_col = 'nCustomers')
print("Scaling factor (77 vs 233):", scale_cust_77vs233)
scale_cust_86vs155 = get_scaling_factor(monthly, trial_store=86, control_store=155, metric_col = 'nCustomers')
print("Scaling factor (86 vs 155):", scale_cust_86vs155)
scale_cust_88vs238 = get_scaling_factor(monthly, trial_store=88, control_store=238, metric_col = 'nCustomers')
print("Scaling factor (88 vs 238):", scale_cust_88vs238)
Scaling factor (77 vs 233): 1.0038022813688212
Scaling factor (86 vs 155): 1.0033277870216306
Scaling factor (88 vs 238): 1.1187683284457477
In [65]:
# Calculate percentageDiff
print('PercentageDiff of customers between trial store 77 and control store 233:')
percent_cust_77vs233 = percentage_diff(monthly, trial_store = 77, control_store = 233, metric_col = 'nCustomers')
percent_cust_77vs233
PercentageDiff of customers between trial store 77 and control store 233:
Out[65]:
TransactionMonth trial control_scaled scaling_factor percentageDiff
0 2019-01-01 35 35.133080 1.003802 0.003788
1 2019-02-01 45 45.171103 1.003802 0.003788
2 2019-03-01 50 40.152091 1.003802 0.245265
3 2019-04-01 47 30.114068 1.003802 0.560732
4 2019-05-01 55 57.216730 1.003802 0.038743
5 2019-06-01 41 41.155894 1.003802 0.003788
In [66]:
print('PercentageDiff of customers between trial store 86 and control store 155:')
percent_cust_86vs155 = percentage_diff(monthly, trial_store = 86, control_store = 155, metric_col = 'nCustomers')
percent_cust_86vs155
PercentageDiff of customers between trial store 86 and control store 155:
Out[66]:
TransactionMonth trial control_scaled scaling_factor percentageDiff
0 2019-01-01 94 96.319468 1.003328 0.024081
1 2019-02-01 107 95.316140 1.003328 0.122580
2 2019-03-01 115 94.312812 1.003328 0.219347
3 2019-04-01 105 99.329451 1.003328 0.057088
4 2019-05-01 104 106.352745 1.003328 0.022122
5 2019-06-01 98 95.316140 1.003328 0.028157
In [67]:
print('PercentageDiff of customers between trial store 88 and control store 238:')
percent_cust_88vs238 = percentage_diff(monthly, trial_store = 88, control_store = 238, metric_col = 'nCustomers')
percent_cust_88vs238
PercentageDiff of customers between trial store 88 and control store 238:
Out[67]:
TransactionMonth trial control_scaled scaling_factor percentageDiff
0 2019-01-01 117 135.370968 1.118768 0.135708
1 2019-02-01 124 115.233138 1.118768 0.076079
2 2019-03-01 134 101.807918 1.118768 0.316204
3 2019-04-01 128 126.420821 1.118768 0.012491
4 2019-05-01 128 129.777126 1.118768 0.013694
5 2019-06-01 121 112.995601 1.118768 0.070838

3.6.4 Customers T-test¶

In [68]:
test_cust_77vs233 = t_test(monthly, trial_store=77, control_store=233, metric_col='nCustomers')
test_cust_77vs233
Out[68]:
TransactionMonth trial control_scaled scaling_factor percentageDiff t_stat significant_95pct lower_95_band upper_95_band pre_mean pre_sd df tcrit_95
0 2019-01-01 35 35.133080 1.003802 0.003788 -0.996941 False -0.02479 0.068576 0.021893 0.018161 5 2.570582
1 2019-02-01 45 45.171103 1.003802 0.003788 -0.996941 False -0.02479 0.068576 0.021893 0.018161 5 2.570582
2 2019-03-01 50 40.152091 1.003802 0.245265 12.299783 True -0.02479 0.068576 0.021893 0.018161 5 2.570582
3 2019-04-01 47 30.114068 1.003802 0.560732 29.670693 True -0.02479 0.068576 0.021893 0.018161 5 2.570582
4 2019-05-01 55 57.216730 1.003802 0.038743 0.927814 False -0.02479 0.068576 0.021893 0.018161 5 2.570582
5 2019-06-01 41 41.155894 1.003802 0.003788 -0.996941 False -0.02479 0.068576 0.021893 0.018161 5 2.570582
In [69]:
test_cust_86vs155 = t_test(monthly, trial_store=86, control_store=155, metric_col='nCustomers')
test_cust_86vs155
Out[69]:
TransactionMonth trial control_scaled scaling_factor percentageDiff t_stat significant_95pct lower_95_band upper_95_band pre_mean pre_sd df tcrit_95
0 2019-01-01 94 96.319468 1.003328 0.024081 0.991180 False -0.013363 0.040685 0.013661 0.010513 5 2.570582
1 2019-02-01 107 95.316140 1.003328 0.122580 10.360673 True -0.013363 0.040685 0.013661 0.010513 5 2.570582
2 2019-03-01 115 94.312812 1.003328 0.219347 19.565351 True -0.013363 0.040685 0.013661 0.010513 5 2.570582
3 2019-04-01 105 99.329451 1.003328 0.057088 4.130922 True -0.013363 0.040685 0.013661 0.010513 5 2.570582
4 2019-05-01 104 106.352745 1.003328 0.022122 0.804845 False -0.013363 0.040685 0.013661 0.010513 5 2.570582
5 2019-06-01 98 95.316140 1.003328 0.028157 1.378945 False -0.013363 0.040685 0.013661 0.010513 5 2.570582
In [70]:
test_cust_88vs238 = t_test(monthly, trial_store=88, control_store=238, metric_col='nCustomers')
test_cust_88vs238 
Out[70]:
TransactionMonth trial control_scaled scaling_factor percentageDiff t_stat significant_95pct lower_95_band upper_95_band pre_mean pre_sd df tcrit_95
0 2019-01-01 117 135.370968 1.118768 0.135708 2.343612 False -0.024703 0.143117 0.059207 0.032643 5 2.570582
1 2019-02-01 124 115.233138 1.118768 0.076079 0.516885 False -0.024703 0.143117 0.059207 0.032643 5 2.570582
2 2019-03-01 134 101.807918 1.118768 0.316204 7.873078 True -0.024703 0.143117 0.059207 0.032643 5 2.570582
3 2019-04-01 128 126.420821 1.118768 0.012491 -1.431122 False -0.024703 0.143117 0.059207 0.032643 5 2.570582
4 2019-05-01 128 129.777126 1.118768 0.013694 -1.394292 False -0.024703 0.143117 0.059207 0.032643 5 2.570582
5 2019-06-01 121 112.995601 1.118768 0.070838 0.356322 False -0.024703 0.143117 0.059207 0.032643 5 2.570582

3.6.5 Visualization of Sales Performance¶

In [71]:
# define a function to visualize control store performance and trial store performance during the whole period
def plot_full(monthly,
             trial_store,
             control_store,
             metric_col,
             trial_start = '2019-01-01',
             trial_end = '2019-06-30'):
    df = monthly[monthly['STORE_NBR'].isin([trial_store, control_store])].copy()

    if 'TransactionMonth' not in monthly.columns:
        monthly['_YM'] = pd.PeriodIndex(monthly['YEARMONTH'].astype(str), freq='M')
        monthly['TransactionMonth'] = monthly['_YM'].dt.to_timestamp()

    # Calculate scale factor
    scale = get_scaling_factor(monthly, trial_store, control_store, metric_col, pre_year=2019)

    # Set trial series and control series
    trial_series = df[df['STORE_NBR'] == trial_store][['TransactionMonth', metric_col]].rename(columns = {metric_col: 'trial'})
    control_series = df[df['STORE_NBR'] == control_store][['TransactionMonth', metric_col]].rename(columns = {metric_col: 'control'})
    full = trial_series.merge(control_series, on = 'TransactionMonth', how = 'inner').sort_values('TransactionMonth')
    full['control_scaled'] = full['control'] * scale

    # Create confidence band
    pre_full = full[full['TransactionMonth'] < pd.to_datetime(trial_start)]
    pctdiff_pre = (pre_full['trial'] - pre_full['control_scaled']).abs() / pre_full['control_scaled']
    mu = pctdiff_pre.mean()
    sd = pctdiff_pre.std(ddof = 1)
    dof = max(len(pctdiff_pre) - 1, 1)
    tcrit = t.ppf(0.975, dof)
    band_low = mu - tcrit*sd
    band_high = mu + tcrit*sd

    # Turn % to abs
    full['band_low'] = full['control_scaled'] * (1 + band_low)
    full['band_high'] = full['control_scaled'] * (1 + band_high)

    # Plot
    fig = go.Figure()

    # Confidence band
    fig.add_trace(go.Scatter(x = full['TransactionMonth'], 
                             y = full['band_high'],
                             mode = 'lines', line = dict(width = 0),
                             name = 'Control 95% Upper',
                             hoverinfo = 'skip',
                             showlegend = False
                            ))
    fig.add_trace(go.Scatter(x = full['TransactionMonth'], 
                             y = full['band_low'],
                             mode = 'lines', line = dict(width = 0), 
                             fill = 'tonexty', fillcolor = 'rgba(135,206,250,0.3)',
                             name = 'Control 5% ~ 95% Band',
                             hoverinfo = 'skip'
                            ))

    # Add control_scaled and trial store
    fig.add_trace(go.Scatter(x = full['TransactionMonth'], y = full['control_scaled'],
                            mode = 'lines + markers',
                            line=dict(color='lightblue'),
                            marker=dict(color='lightblue'),
                            name=f'Control {control_store} (scaled)'))
    fig.add_trace(go.Scatter(x = full['TransactionMonth'], y = full['trial'],
                            mode = 'lines + markers',
                            line=dict(color='orange'),
                            marker=dict(color='orange'),
                            name=f'Trial {trial_store}'))

    # Highlight trial period
    fig.add_vrect(x0=pd.to_datetime(trial_start), x1=pd.to_datetime(trial_end),
                  fillcolor='rgba(160,160,160,0.15)', line_width=0, layer="below")

    pretty = {'totSales':'Sales', 'nCustomers':'Customers'}
    ylab = pretty.get(metric_col, metric_col)

    fig.update_layout(
        title=f'{ylab} over time — Trial {trial_store} vs Control {control_store}',
        xaxis_title='Month', yaxis_title = ylab, hovermode='x unified',
        legend=dict(
        orientation="h",      # adjust the legend position
        yanchor="bottom",   
        y=1.05,             
        xanchor="center",     
        x=0.5               
        ),
        xaxis=dict(
        dtick="M1",  # show every month
        tickformat="%b %Y" 
    ))

    return fig

full_sales_77vs233 = plot_full(monthly, trial_store=77, control_store=233, metric_col='totSales', trial_start='2019-01-01', trial_end='2019-06-30')
full_sales_77vs233.show()
full_cust_77vs233 = plot_full(monthly, trial_store=77, control_store=233, metric_col='nCustomers', trial_start='2019-01-01', trial_end='2019-06-30')
full_cust_77vs233.show()
In [72]:
full_sales_86vs155 = plot_full(monthly, trial_store=86, control_store=155, metric_col='totSales', trial_start='2019-01-01', trial_end='2019-06-30')
full_sales_86vs155.show()
full_cust_86vs155 = plot_full(monthly, trial_store=86, control_store=155, metric_col='nCustomers', trial_start='2019-01-01', trial_end='2019-06-30')
full_cust_86vs155.show()
In [73]:
full_sales_88vs238 = plot_full(monthly, trial_store=88, control_store=238, metric_col='totSales', trial_start='2019-01-01', trial_end='2019-06-30')
full_sales_88vs238.show()
full_cust_88vs238 = plot_full(monthly, trial_store=88, control_store=238, metric_col='nCustomers', trial_start='2019-01-01', trial_end='2019-06-30')
full_cust_88vs238.show()
In [74]:
# define a function to visualize percentage difference between trial stores and control stores, in only trial period.
def visual(monthly, trial_store, control_store, metric_col):
    df = t_test(monthly, trial_store, control_store, metric_col).copy()

    # lower band and higher band
    low = float(df['lower_95_band'].iloc[0])
    high = float(df['upper_95_band'].iloc[0])

    fig = go.Figure()

    # draw confidence band (5% ~ 95%)
    fig.add_trace(go.Scatter(
        x=df['TransactionMonth'], y=[high]*len(df),
        mode='lines', line=dict(width=0), showlegend=False, hoverinfo='skip'
    ))
    fig.add_trace(go.Scatter(
        x=df['TransactionMonth'], y=[low]*len(df),
        mode='lines', line=dict(width=0), fill='tonexty',
        fillcolor='rgba(173,216,230,0.3)', name='5%–95% band', hoverinfo='skip'
    ))

    # percentage diff line
    fig.add_trace(go.Scatter(
        x=df['TransactionMonth'], y=df['percentageDiff'],
        mode='lines+markers', name='percentageDiff',
        line=dict(color='orange'),  
        marker=dict(color='orange'),
        hovertemplate="Month=%{x|%Y-%m}<br>Diff=%{y:.3%}<br>t=%{customdata[0]:.3f}<br>p=%{customdata[1]:.4f}",
        customdata=np.c_[df['t_stat']]
    ))

    # mark significant points with red dot
    sig = df[df['significant_95pct']]
    if not sig.empty:
        fig.add_trace(go.Scatter(
            x=sig['TransactionMonth'], y=sig['percentageDiff'],
            mode='markers', marker=dict(size=10, symbol='circle', line=dict(width=1), color='#DAA520'),
            name='Significant (95%)'
        ))
    
    fig.update_layout(
        title=f"{metric_col} % difference of store {trial_store} vs store {control_store} (5% - 95% band shaded)",
        xaxis_title="Month", 
        yaxis_title="percentageDiff",
        yaxis_tickformat=".0%", 
        hovermode='x unified'
    )
    
    return fig

vis77vs233_sales = visual(monthly, trial_store = 77, control_store = 233, metric_col = 'totSales')
vis77vs233_sales.show()
vis77vs233_cust = visual(monthly, trial_store = 77, control_store = 233, metric_col = 'nCustomers')
vis77vs233_cust.show()
In [75]:
vis86vs155_sales = visual(monthly, trial_store = 86, control_store = 155, metric_col = 'totSales')
vis86vs155_sales.show()
vis86vs155_cust = visual(monthly, trial_store = 86, control_store = 155, metric_col = 'nCustomers')
vis86vs155_cust.show()
In [76]:
vis88vs238_sales = visual(monthly, trial_store = 88, control_store = 238, metric_col = 'totSales')
vis88vs238_sales.show()
vis88vs238_cust = visual(monthly, trial_store = 88, control_store = 238, metric_col = 'nCustomers')
vis88vs238_cust.show()

From the visualizations we can see that trial store performs very well in March and April, which means new store layout is effective. Total sales and number of customers increase significantly.

In [ ]: